In this notebook a predictive model is built in order to identify what clients are likely to churn in the following month.
The data used for this model is the already processed file: finaldataframe[date]_[version].csv which is built with edp_churn_data_pipeline_revised.ipynb using as sources:
Previous feature engineering is performed to include spatial features to our model describing the churners behavior in a client's neighborhood. Some examples of these features are:
This notebook shows the main steps followed when training the model. It is organized by iterations. This notebook contains the iterations that contributed the most to the model's performance improvement:
Iteration 0: Baseline model
A baseline model is built considering only features created using customer profile data and an XGBClassifier algorithm with default hyperparameters.
Iteration 1: Weights to observations and bad quality predictors removed
Recent churn events are given more weight than older ones. In addition, features with a strong time dependence are either removed or detrended.
Iteration 2: Introduction of spatial features + Recursive Feature Elimination
Spatial features are included into the model training.
We apply advanced techniques to identify the 25 most impactful features. This is done with two goals:
Iteration 3: Algorithm optimization
Hyperparameter tuning to find the optimal ones. Results obtained are explained in further detail.
In addition, two spatial analyses are performed to explain the differences in the model's performance throughout Portugal identified thanks to results visualization with CARTOFrames:
# Environment setup (Google Colab). Pinned installs kept for reproducibility.
#!pip install cartoframes==1.0b2
#!pip install geopandas
#!pip install pysal==1.14.4 # versions >= 2.0.0 won't work in Colab
#!pip install shap
import geopandas
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
pd.set_option('display.max_columns', 500)  # show wide dataframes in full when previewing
import pysal
import seaborn as sns
import shap
from cartoframes.auth import Credentials, set_default_credentials
from cartoframes.client import SQLClient
from cartoframes.data import Dataset
from cartoframes.viz import Map, Layer, basemaps
from cartoframes.viz.helpers import color_bins_layer
from google.colab import drive
drive.mount('/gdrive')  # mount Google Drive to read the preprocessed input CSV
from inspect import signature
from shapely import wkt
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import RFE, RFECV
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, auc, average_precision_score,classification_report, confusion_matrix, f1_score, precision_recall_curve, \
precision_score, recall_score, roc_curve
from sklearn.model_selection import cross_val_score, KFold, train_test_split
from sklearn.preprocessing import StandardScaler
from xgboost import XGBClassifier
from yellowbrick.model_selection import ValidationCurve
shap.initjs()  # enable SHAP's interactive JS plots inside the notebook
sns.set()  # seaborn default plot styling
# CARTO account credentials (api_key intentionally blank here — fill in before running).
my_base_url = 'https://malvarez-carto.carto.com/'
my_api_key=''
cred = Credentials(base_url=my_base_url,
api_key=my_api_key)
set_default_credentials(
base_url=my_base_url,
api_key=my_api_key
)
sql_client = SQLClient(cred)
# Reference period of this model run (YYYYMM).
run_date = 201901
# Preprocessed client-level dataframe produced by edp_churn_data_pipeline_revised.ipynb.
df_final = pd.read_csv('/gdrive/My Drive/final_dataframe_201901_new_MODIFIED.csv')
df_final.head()
#### SOCIODEMOG
# Pull aggregated demographics per zip code from CARTO; the_geom is additionally
# exported as WKT text so it can be parsed into shapely geometries below.
query = """
SELECT
*,
ST_AsText(the_geom) as geometry
FROM edp_cp_demographics_agg
"""
df_demog = Dataset(query).download()
df_demog['postcode'] = df_demog['postcode'].astype(int)
demog = geopandas.GeoDataFrame(df_demog)
demog['geometry'] = demog['geometry'].apply(wkt.loads)  # WKT text -> shapely geometry
demog.crs = {'init': 'EPSG:4326'}  # NOTE(review): dict-style CRS is deprecated in newer geopandas/pyproj — works for the pinned versions here
demog.head(2)
## Residual table
# Upload a per-zip-code table to CARTO that later holds the model residuals
# (recall/precision/accuracy per postcode) used by the map visualizations below.
relevant_items = ['the_geom', 'geometry', 'postcode', 'name', 'no_churn_clients', 'no_churn_clients_rel']
residuals = demog[relevant_items]
ds = Dataset(residuals)
ds.upload(table_name='edp_churn_residuals', if_exists='replace', credentials=cred)
# Query kept for re-loading the residuals table; not executed in this cell.
residual_query = """
SELECT
*,
ST_AsText(the_geom) as geometry
FROM edp_churn_residuals
"""
# In order to characterize post codes better, we use the historical churn_rate
df_final = df_final.merge(df_demog[['postcode', 'no_churn_clients_rel']], how='left', left_on='cp_main', right_on='postcode').drop('postcode', axis=1)
df_final.head(2)
Here we normalize:
# --- Feature normalization ---------------------------------------------------
# Shares of active accounts/contracts over the client's historical totals.
df_final['active_accounts_rel'] = df_final['no_active_accounts'] / df_final['id_conta_rgpd_count']
df_final['active_contracts_rel'] = df_final['no_active_contracts'] / df_final['id_contrato_rgpd_count']
# Dwelling-type shares. Compute the common denominator once instead of three times.
total_dwellings = df_final['apartamento'] + df_final['moradia'] + df_final['other_dwellings']
df_final['apartamento_rel'] = df_final['apartamento'] / total_dwellings
df_final['moradia_rel'] = df_final['moradia'] / total_dwellings
df_final['other_dwellings_rel'] = df_final['other_dwellings'] / total_dwellings
# Watch out! There are clients registered in zip codes not included in the sociodem data. For now, we filter them.
df_final = df_final[df_final['p_t'] > 0]
# Age bands converted to shares of the zip code's total population (p_t).
for age_col in ['age_t0014', 'age_t1529', 'age_t3044', 'age_t4559', 'age_t60pl']:
    df_final[age_col] = df_final[age_col] / df_final['p_t']
# Lifetime coefficients: client lifetime relative to local/global moving averages.
# The +1 in each denominator avoids division by zero.
df_final['lifetime_global_coeff'] = df_final['life_time_days'] / (df_final['lt_global_pop_mean_6m']+1)
df_final['lifetime_local_churn_coeff'] = df_final['life_time_days'] / (df_final['lt_local_churn_mean_12m']+1)
#df_final['lifetime_local_coeff'] = df_final['life_time_days'] / (df_final['lt_local_pop_mean_12m']+1)
df_final['lifetime_coeff'] = df_final['life_time_days'] / (df_final['median_life_time_days_cp_main']+1)
# Consumption coefficients: client consumption relative to the zip code's expected
# annual consumption (electricity E, gas G, and the combined total).
df_final['qtd_consumo_anual_E_12m_coeff'] = df_final['median_E'] / (df_final['cp_qtd_consumo_anual_esp_E_12m']+1)
df_final['qtd_consumo_anual_G_12m_coeff'] = df_final['median_G'] / (df_final['cp_qtd_consumo_anual_esp_G_12m']+1)
df_final['qtd_consumo_anual_coeff'] = (df_final['mean_E'] * df_final['e_count'] + df_final['mean_G'] * df_final['g_count']) / \
    (df_final['qtd_consumo_anual_esp_cp_main']+1)
Some continuous (integer) variables have very large values. We'll assign them the next value of their 0.98 quantile.
# Cap heavy-tailed count features: any value above (0.98-quantile + 1) is replaced
# by that cap, limiting the influence of extreme outliers.
interest_feats = ['id_conta_rgpd_count', 'id_contrato_rgpd_count', 'e_count', 'g_count', 's_count', 'eg_count',
                  'no_active_accounts', 'no_active_contracts', 'no_products', 'apartamento', 'moradia', 'other_dwellings']
quantile = 0.98
for feature in interest_feats:
    quant_val = df_final[feature].quantile(quantile)
    print(feature, quant_val)
    cap = quant_val + 1
    over_cap = df_final[feature] > cap
    df_final.loc[over_cap, feature] = cap
We classify variables by their type for further treatment.
#Target column
target_var = 'churn'
#categorical columns
cat_vars = ['consent_data', 'main_postal_region', 'main_urban_type', 'start_month']
#Binary columns with 2 values
# FIX: removed the trailing space in 'flg_deb_direto ' — with the space the real
# column was not matched, so it leaked into num_vars instead of being treated as binary
# (the feature lists further down all use 'flg_deb_direto' without a space).
bin_vars = ['consent_data', 'flg_deb_direto', 'flg_fatura_eletronica', 'flg_edp_online', 'flg_cod_produto_key', 'solar']
#numerical columns: everything from 'id_conta_rgpd_count' onward that is not categorical/binary
num_vars = [x for x in df_final.loc[:, 'id_conta_rgpd_count':].columns.values.tolist() if x not in cat_vars + bin_vars]
df_final[num_vars].describe()
We identified some negative values, and some very extreme ones in expected consumption.
If we take a closer look at extreme values, we can see they are especially highly concentrated in three regions (see map below), but for less than 5% of the zip codes extreme values account for more than 1% of the customers there.
Given this, we decided to discard:
# Distribution of mean electricity consumption, excluding the top 0.5% tail.
metric = 'mean_E'
plt.figure(figsize=(12,5))
sns.distplot(df_final.loc[df_final[metric] < df_final[metric].quantile(0.995), metric])
Let's see if they are geographically concentrated.
# Outliers: clients at/above the 99.5th percentile of mean_E.
condition = df_final[metric] >= df_final[metric].quantile(0.995)
#condition = df_final[metric] < 0
# Count outlier clients per zip code; normalize by client count (+1 avoids div by zero).
outlier_concentration = demog[['geometry', 'postcode', 'name', 'no_clients']].\
    merge(df_final[condition].\
          groupby('cp_main').agg({'id_cliente_rgpd':'count'}).\
          reset_index().rename(columns={'id_cliente_rgpd':'outlier_count'}),
          how='left', left_on='postcode', right_on='cp_main').fillna(0).drop('cp_main', axis=1)
outlier_concentration['outlier_count_rel'] = outlier_concentration['outlier_count'] / (outlier_concentration['no_clients']+1)
# Publish to CARTO and render a choropleth of outlier concentration per zip code.
ds = Dataset(outlier_concentration)
ds.upload(table_name='edp_outlier_concentration', if_exists='replace', credentials=cred)
category_attribute = 'outlier_count_rel' # outlier_count
Map(
    color_bins_layer('edp_outlier_concentration', category_attribute, 'Outlier count')
)
Filtering is applied both based on electricity and gas consumption:
# Drop rows with negative or extreme (top 0.5%) electricity/gas consumption values.
# NOTE(review): rows where min_E/min_G is NaN (e.g. single-fuel clients) also fail
# these comparisons and are dropped — confirm this is intended.
df_final = df_final[(df_final['min_E'] >= 0) & (df_final['max_E'] < df_final['max_E'].quantile(0.995))]
df_final = df_final[(df_final['min_G'] >= 0) & (df_final['max_G'] < df_final['max_G'].quantile(0.995))]
# One-hot encode the categorical variables and append the dummy columns.
one_hot_cat = pd.get_dummies(data = df_final[cat_vars],columns = cat_vars)
one_hot_cat.head(2)
df_final = pd.concat([df_final, one_hot_cat], axis=1)
df_final.head(2)
We take a first look at the correlation matrix. First, considering all historical data, afterwards only considering churn happening after a certain date, and finally considering clients joining the company after a certain date.
We can see churn is weakly correlated to most of the features except for those related to life time moving average at different time and spatial levels. This shows we have to be extremely careful with features regarding life time as it is strongly time-dependent.
Zooming on high correlation areas of the corr matrix, we can identify some highly correlated variables. We should be careful with these variables so that they don't introduce noise to the model we train.
# Features built purely from EDP customer/contract data (no spatial context).
non_spatial_features = ['id_conta_rgpd_count', 'id_contrato_rgpd_count', 'e_count', 'g_count',
                        's_count', 'eg_count', 'life_time_days', 'lifetime_global_coeff',
                        'no_active_accounts', 'no_active_contracts', 'no_products',
                        'consent_data', 'apartamento',
                        'moradia', 'other_dwellings', 'flg_deb_direto', 'flg_fatura_eletronica',
                        'flg_edp_online', 'mean_E', 'mean_G', 'median_E', 'median_G', 'min_E',
                        'min_G', 'max_E', 'max_G', 'start_month', 'flg_cod_produto_key',
                        'time_since_last_upgrade',
                        'time_since_last_downgrade', 'solar', 'active_accounts_rel',
                        'active_contracts_rel', 'apartamento_rel', 'moradia_rel',
                        'other_dwellings_rel']
# plot_corr_matrix is a project helper defined elsewhere (not in this file).
plot_corr_matrix(df_final.sample(100000), ['churn'] + non_spatial_features, 'corr - non-spatial features')
item_list = ['churn'] + df_final.columns[5:75].tolist()
plot_corr_matrix(df_final.sample(100000), item_list, 'corr')
Let's now take a look considering only customers who churned after 201701 or are still active.
# Correlation matrix restricted to churn events after 2017-01 (sampled rows).
plot_corr_matrix(df_final[df_final['year_month'] > 201701].sample(35000), item_list, 'corr 201701 - ')
Churn is highly correlated with features regarding time-window life time values and with the feature flg_cod_produto_key which identifies if the client has an E/G product different from "Electricidade"/"Gas".
We must be extremely careful with these features because as we saw in our first analysis (summarized in edp_data_summary.ipynb) all of them depend heavily on time: lifetime increases continuously over time, and there are no new clients with E/G contracts different from "Electricity"/"Gas".
# Churn's strongest correlations: print every feature whose absolute correlation
# with churn exceeds 0.4, restricted to clients who joined after 2012-01.
recent = df_final[df_final['year_month_start'] > 201201]
churn_series = recent['churn']
for item in item_list:
    corr_coef = np.corrcoef(churn_series, recent[item])[0, 1]
    if abs(corr_coef) > 0.4:
        print(item, corr_coef)
Let's zoom in over those areas of the corr matrix with higher correlation values.
Summary:
# Zoom into the highly inter-correlated account/contract count features.
item_list = ['id_conta_rgpd_count', 'id_contrato_rgpd_count', 'e_count', 'g_count',
             's_count', 'eg_count', 'no_active_accounts', 'no_active_contracts', 'no_products',
             'active_accounts_rel', 'active_contracts_rel']
plot_corr_matrix(df_final[df_final['year_month_start'] > 201201].sample(35000), item_list, 'corr')
We'll start working with features obtained from EDP's historical data, and will increasingly add spatial features to measure the improvement in performance.
Class imbalance
There is an 86-14 class imbalance in our target variable. We will balance this variable by downsampling non-churners.
Performance metric
Even though throughout the model building process we evaluate different metrics, our focus metric will be recall. This is because not identifying a churner properly and losing the client is more costly than identifying a non-churner as churner. In particular, we especially care about recall in the last month of the prediction (in this case January 2019).
Time dependent variables
On all the analyses done, we have seen churn patterns have changed through time. For example, while in 2016/2017 flg_cod_produto_key was a very good predictor, it is no longer the case. We should be extremely careful with this. That is why we decided to limit the data used to train the model, hence the two parameters below:
# Training-data cut-offs (YYYYMM): only churn events after 2016-01 and clients who
# joined after 2012-01 are used, to limit the time-drift effects described above.
starting_churn_date_cut = 201601
starting_join_date_cut = 201201
We'll start by training an XGBClassifier model with default hyperparameters, and the features in non_spatial_features.
Summary of results:
# --- Iteration 0: baseline XGBClassifier on non-spatial features -------------
# Identifying columns carried along with the model features.
client_features = ['id_cliente_rgpd', 'churn', 'cp_main', 'year_month_start', 'year_month']
df_final_f = df_final.loc[(df_final['year_month'] > starting_churn_date_cut) & (df_final['year_month_start'] >= starting_join_date_cut), client_features + non_spatial_features]
# Balance classes by downsampling non-churners to the churner count.
sample = df_final_f[~df_final_f['churn']].sample(df_final_f[df_final_f['churn']].shape[0], random_state=111)
sample = pd.concat([sample, df_final_f[df_final_f['churn']]])
sample.shape
features = non_spatial_features
X_train, X_test, y_train, y_test = train_test_split(sample.loc[:, features], sample['churn'].astype(int),
                                                    test_size=0.3, random_state=7)
model = XGBClassifier(random_state = 111)
model.fit(X_train, y_train)
# calculate_performance_metrics is a project helper (defined elsewhere); the last two
# arguments label this run's metrics (iteration id / residual-column name).
calculate_performance_metrics(model, sample, X_train, X_test, y_train, y_test, 0, 'recall_0')
# Top-25 features by XGBoost importance.
importances = pd.DataFrame(model.feature_importances_, index=sample.loc[:, features].columns, columns=['importance']).sort_values('importance', ascending=False)
plt.figure(figsize=(16,6))
sns.barplot(x='importance', y='index', data=importances.reset_index()[:25], color='blue')
#plt.title('Feature importance', size=14)
The figure below shows the big change in flg_cod_produto_key during 2018.
# Monthly average of flg_cod_produto_key — visualizes its sharp change during 2018.
plt.figure(figsize=(14,4))
flg_cod_produto_key_monthly = sample.groupby('year_month').agg({'flg_cod_produto_key':'mean'}).reset_index()
sns.lineplot(x = flg_cod_produto_key_monthly['year_month'].astype(str), y=flg_cod_produto_key_monthly['flg_cod_produto_key'])
plt.title('Average flg_cod_produto_key per churning month', fontsize=15, color='black', weight='medium')
plt.xticks(rotation='vertical', fontsize='x-small')
plt.grid(axis='x')
# Churners' lifetime per churning month: raw life_time_days (left) trends upward
# over time, while the detrended lifetime_global_coeff (right) is flatter.
fig, axs = plt.subplots(1, 2, figsize=(18,4))
lt_monthly = sample[sample['churn']].groupby('year_month').agg({'life_time_days':'mean'}).reset_index()
sns.lineplot(x = lt_monthly['year_month'].astype(str), y=lt_monthly['life_time_days'], ax=axs[0])
axs[0].set_title('Average lifetime per churning month', fontsize=15, color='black', weight='medium')
axs[0].tick_params(labelrotation=90, size=1)
axs[0].grid(axis='x')
lt_coeff_monthly = sample[sample['churn']].groupby('year_month').agg({'lifetime_global_coeff':'mean'}).reset_index()
sns.lineplot(x = lt_coeff_monthly['year_month'].astype(str), y=lt_coeff_monthly['lifetime_global_coeff'], ax=axs[1])
axs[1].set_title('Average lifetime coeff per churning month', fontsize=15, color='black', weight='medium')
axs[1].tick_params(labelrotation=90, size=1)
axs[1].grid(axis='x')
On this iteration, we'll focus on correcting the performance's time dependency by giving priority to recent observations of churn. We'll do this by sampling a decreasing number of churners as we go further backwards in time.
In addition, we detrend or remove those features with a high time dependence.
Summary of results:
# --- Iteration 1: time-weighted sampling of churners --------------------------
df_final_f = df_final.loc[(df_final['year_month'] > starting_churn_date_cut) & (df_final['year_month_start'] >= starting_join_date_cut), client_features + non_spatial_features]
# Churn months, most recent first.
ending_periods = df_final_f['year_month'].sort_values(ascending=False).unique()
# Sampling fraction per month: 12x (oversampling with replacement) for the most
# recent month, halved for each month further back, floored at 5%.
sample_size_f = 12
replace = sample_size_f > 1  # frac > 1 requires sampling with replacement
sample = df_final_f[(df_final_f['churn']) & (df_final_f['year_month'] == ending_periods[0])].sample(frac=sample_size_f, replace=replace, random_state=111)
for i in range(1, len(ending_periods)):
    sample_size_f /= 2
    if sample_size_f < 0.05:
        sample_size_f = 0.05
    replace = sample_size_f > 1
    sample = pd.concat([sample, df_final_f[(df_final_f['churn']) & (df_final_f['year_month'] == ending_periods[i])].sample(frac=sample_size_f, replace=replace, random_state=111)])
# Balance with an equal number of non-churners.
sample = pd.concat([sample, df_final_f[~df_final_f['churn']].sample(sample.shape[0], random_state=111)])
sample.shape
# Iteration-1 feature set: strongly time-dependent predictors removed or detrended
# (life_time_days -> lifetime_global_coeff; flg_cod_produto_key, min/max/median
# consumption and active_accounts_rel dropped).
features = ['id_conta_rgpd_count', 'id_contrato_rgpd_count', 'e_count', 'g_count',
            's_count', 'eg_count', 'lifetime_global_coeff',
            'no_active_accounts', 'no_active_contracts', 'no_products',
            'consent_data', 'apartamento',
            'moradia', 'other_dwellings', 'flg_deb_direto', 'flg_fatura_eletronica',
            'flg_edp_online', 'mean_E', 'mean_G', 'start_month',
            'time_since_last_upgrade',
            'time_since_last_downgrade', 'solar',
            'active_contracts_rel', 'apartamento_rel', 'moradia_rel',
            'other_dwellings_rel']
X_train, X_test, y_train, y_test = train_test_split(sample.loc[:, features], sample['churn'].astype(int),
                                                    test_size=0.3, random_state=7)
model = XGBClassifier(random_state = 111)
model.fit(X_train, y_train)
# Project helper; stores this iteration's metrics under the 'recall_2' residual column.
calculate_performance_metrics(model, sample, X_train, X_test, y_train, y_test, 2, 'recall_2')
importances = pd.DataFrame(model.feature_importances_, index=sample.loc[:, features].columns, columns=['importance']).sort_values('importance', ascending=False)
plt.figure(figsize=(16,6))
sns.barplot(x='importance', y='index', data=importances.reset_index()[:25], color='blue')
plt.title('Feature importance', size=14)
Let's visualize the model's performance in the different zip codes.
# RESIDUAL SPATIAL AUTOCORRELATION
# Choropleth of iteration-1 recall per zip code (color buckets at 0.65/0.8/0.9);
# hover popup shows recall, precision, accuracy, and the zip code.
residual_metric = 'recall_2'
Map(
    Layer('edp_churn_residuals', '''
color: ramp(buckets(${}, [0.65, 0.8, 0.9]), Emrld)
strokeWidth: 0.4
strokeColor: opacity(blue,0.2)
order: asc(width())
'''.format(residual_metric),
          popup={
              'hover': [{
                  'title': 'Recall',
                  'value': '$recall_2'
              },{
                  'title': 'Precision',
                  'value': '$precision_2'
              },{
                  'title': 'Accuracy',
                  'value': '$accuracy_2'
              },
              {
                  'title': 'zip code',
                  'value': '$postcode'
              }]
          },
          legend={
              'title':'{}'.format(residual_metric),
              'type': 'color-continuous-polygon',
              'prop': 'color'
          }),
    # Initial view centered on mainland Portugal.
    viewport={
        'zoom': 5.75,
        'lat': 39.5129,
        'lng': -8.0977
    },
    basemap = basemaps.darkmatter
)
On this iteration, we introduce spatial features and perform feature selection.
Summary of results:
# Spatial / sociodemographic features added in iteration 2: household counts and
# size, age-band shares, postal-region and urban-type dummies, population density,
# local churn rates/trends, and expected-consumption features per zip code.
new_features = ['hh_t', 'hh_size', 'age_t0014', 'age_t1529', 'age_t3044', 'age_t4559',
                'age_t60pl', 'pp_ci', 'main_postal_region_1', 'main_postal_region_2',
                'main_postal_region_3', 'main_postal_region_4', 'main_postal_region_5',
                'main_postal_region_6', 'main_postal_region_7', 'main_postal_region_8',
                'main_postal_region_9', 'main_urban_type_0', 'main_urban_type_1', 'pop_density_cp_main',
                'churn_rate_6m', 'churn_trend_6m', 'cp_qtd_consumo_anual_esp_E_12m',
                'cp_qtd_consumo_anual_esp_G_12m', '1m_churn_rate_fullzc', '1m_churn_fullzc',
                'lifetime_local_churn_coeff', 'qtd_consumo_anual_coeff', 'no_churn_clients_rel',
                'no_clients_cp_main', 'no_clients_rel_cp_main',
                'qtd_consumo_anual_esp_cp_main']
# --- Iteration 2: same time-weighted sampling, now including spatial features --
df_final_f = df_final.loc[(df_final['year_month'] > starting_churn_date_cut) & (df_final['year_month_start'] >= starting_join_date_cut), client_features + non_spatial_features + new_features]
ending_periods = df_final_f['year_month'].sort_values(ascending=False).unique()
# Per-month sampling fraction: 12x for the most recent month, halved going back, floored at 5%.
sample_size_f = 12
replace = sample_size_f > 1
sample = df_final_f[(df_final_f['churn']) & (df_final_f['year_month'] == ending_periods[0])].sample(frac=sample_size_f, replace=replace, random_state=111)
for i in range(1, len(ending_periods)):
    sample_size_f /= 2
    if sample_size_f < 0.05:
        sample_size_f = 0.05
    replace = sample_size_f > 1
    sample = pd.concat([sample, df_final_f[(df_final_f['churn']) & (df_final_f['year_month'] == ending_periods[i])].sample(frac=sample_size_f, replace=replace, random_state=111)])
# Balance with an equal number of non-churners.
sample = pd.concat([sample, df_final_f[~df_final_f['churn']].sample(sample.shape[0], random_state=111)])
sample.shape
# Iteration-1 feature set extended with the spatial features (before RFE selection).
features = ['id_conta_rgpd_count', 'id_contrato_rgpd_count', 'e_count', 'g_count',
            's_count', 'eg_count', 'lifetime_global_coeff',
            'no_active_accounts', 'no_active_contracts', 'no_products',
            'consent_data', 'apartamento',
            'moradia', 'other_dwellings', 'flg_deb_direto', 'flg_fatura_eletronica',
            'flg_edp_online', 'mean_E', 'mean_G', 'start_month',
            'time_since_last_upgrade',
            'time_since_last_downgrade', 'solar',
            'active_contracts_rel', 'apartamento_rel', 'moradia_rel',
            'other_dwellings_rel'] + new_features
At this point, the number of features is quite high and they may introduce noise. Before training the model, we perform a recursive feature elimination. First, iterating 3 times over a RFECV process, and finally over a RFE indicating we'd like to keep 25 features, which seems a reasonable amount to later be able to have explainable results.
Spatial features selected in the process:
# Final 25 features kept after recursive feature elimination (3 RFECV passes, then
# one RFE run with n_features_to_select=25 — see the inert code below).
features = ['id_conta_rgpd_count', 'id_contrato_rgpd_count', 'e_count', 's_count', 'eg_count', 'lifetime_global_coeff', 'no_active_accounts', 'no_active_contracts',
            'no_products', 'moradia', 'flg_deb_direto', 'flg_fatura_eletronica', 'mean_E', 'time_since_last_upgrade', 'time_since_last_downgrade', 'active_contracts_rel',
            'apartamento_rel', 'moradia_rel', 'hh_size', 'pop_density_cp_main', 'churn_rate_6m', 'churn_trend_6m', 'cp_qtd_consumo_anual_esp_E_12m',
            'lifetime_local_churn_coeff', 'no_churn_clients_rel']
# The RFE/RFECV steps below are deliberately disabled (wrapped as inert string
# literals, notebook-style) and kept to document how the list above was obtained.
'''rfecv_sample = sample.sample(int(np.ceil(sample.shape[0] * 0.1)))'''
'''selector = RFECV(RandomForestClassifier(max_depth=6, random_state=111, n_estimators=100, min_samples_leaf = 5), step=1, cv=3, scoring='accuracy', n_jobs=8)
selector = selector.fit(rfecv_sample.loc[:, features], rfecv_sample['churn'].astype(int))'''
'''plt.figure(figsize=(12,5))
plt.xlabel("Number of features selected")
plt.ylabel("Cross validation score (accuracy)")
plt.plot(range(1, len(selector.grid_scores_) + 1), selector.grid_scores_)'''
'''selector.n_features_'''
'''# This final set of features was obtained after 3 iterations over previous selector
features = np.array(features)[selector.support_].tolist()'''
'''selector = RFE(RandomForestClassifier(max_depth=6, random_state=111, n_estimators=100, min_samples_leaf = 5), step=1, n_features_to_select=25)
selector = selector.fit(rfecv_sample.loc[:, features], rfecv_sample['churn'].astype(int))'''
'''prev_features = features'''
'''features = np.array(features)[selector.support_].tolist()'''
# Train iteration-2 model on the 25 RFE-selected features.
X_train, X_test, y_train, y_test = train_test_split(sample.loc[:, features], sample['churn'].astype(int),
                                                    test_size=0.3, random_state=7)
model = XGBClassifier(random_state = 111)
model.fit(X_train, y_train)
# Project helper; stores this iteration's metrics under the 'recall_5' residual column.
calculate_performance_metrics(model, sample, X_train, X_test, y_train, y_test, 5, 'recall_5')
importances = pd.DataFrame(model.feature_importances_, index=sample.loc[:, features].columns, columns=['importance']).sort_values('importance', ascending=False)
plt.figure(figsize=(16,6))
sns.barplot(x='importance', y='index', data=importances.reset_index()[:25], color='blue')
plt.title('Feature importance', size=14)
Let's visualize the model's performance in the different zip codes.
# RESIDUAL SPATIAL AUTOCORRELATION
# Choropleth of iteration-2 recall per zip code (buckets at 0.65/0.8/0.9).
residual_metric = 'recall_5'
#color: ramp(globalQuantiles(${}, 4), Emrld)
Map(
    Layer('edp_churn_residuals', '''
color: ramp(buckets(${}, [0.65, 0.8, 0.9]), Emrld)
strokeWidth: 0.4
strokeColor: opacity(blue,0.2)
order: asc(width())
'''.format(residual_metric),
          popup={
              'hover': [{
                  'title': 'Recall',
                  'value': '$recall_5'
              },{
                  'title': 'Precision',
                  'value': '$precision_5'
              },{
                  'title': 'Accuracy',
                  'value': '$accuracy_5'
              },
              {
                  'title': 'zip code',
                  'value': '$postcode'
              }]
          },
          legend={
              'title':'{}'.format(residual_metric),
              'type': 'color-continuous-polygon',
              'prop': 'color'
          }),
    # Initial view centered on mainland Portugal.
    viewport={
        'zoom': 5.75,
        'lat': 39.5129,
        'lng': -8.0977
    },
    basemap = basemaps.darkmatter
)
On this iteration we optimize our algorithm performing some hyperparameter tuning.
Summary of results:
# --- Iteration 3 sampling: steeper recency weighting (16x start, 2.5% floor) ---
df_final_f = df_final.loc[(df_final['year_month'] > starting_churn_date_cut) & (df_final['year_month_start'] >= starting_join_date_cut), client_features + non_spatial_features + new_features]
ending_periods = df_final_f['year_month'].sort_values(ascending=False).unique()
sample_size_f = 16
replace = sample_size_f > 1  # frac > 1 requires sampling with replacement
sample = df_final_f[(df_final_f['churn']) & (df_final_f['year_month'] == ending_periods[0])].sample(frac=sample_size_f, replace=replace, random_state=111)
for i in range(1, len(ending_periods)):
    sample_size_f /= 2
    if sample_size_f < 0.025:
        sample_size_f = 0.025
    replace = sample_size_f > 1
    sample = pd.concat([sample, df_final_f[(df_final_f['churn']) & (df_final_f['year_month'] == ending_periods[i])].sample(frac=sample_size_f, replace=replace, random_state=111)])
# Alternative non-churner sampling stratified by join month — disabled, kept for reference.
'''initial_periods = df_final_f['year_month_start'].unique()
no_initial_periods = len(initial_periods)
sample_size = int(np.ceil(sample.shape[0] / no_initial_periods))
for initial_period in initial_periods:
sample = pd.concat([sample, df_final_f[(~df_final_f['churn']) & (df_final_f['year_month_start'] == initial_period)].sample(sample_size, replace=True, random_state=111)])'''
# Balance with an equal number of non-churners.
sample = pd.concat([sample, df_final_f[~df_final_f['churn']].sample(sample.shape[0], random_state=111)])
sample.shape
# Hyperparameter tuning on a 10% subsample using yellowbrick validation curves.
opt_sample = sample.sample(int(np.ceil(sample.shape[0] * 0.1)))
# Sweep 1: tree depth.
param_range = [2, 3, 4, 5]
viz = ValidationCurve(
    XGBClassifier(random_state = 111), param_name="max_depth", param_range=param_range,
    cv=3, scoring="accuracy", n_jobs=8,
)
viz.fit(opt_sample.loc[:, features], opt_sample['churn'].astype(int))
viz.poof()
# Sweep 2: learning rate, with depth fixed at 3.
param_range = [0.01, 0.05, 0.1, 0.15, 0.2, 0.25]
viz = ValidationCurve(
    XGBClassifier(max_depth=3, random_state = 111), param_name="learning_rate", param_range=param_range,
    cv=3, scoring="accuracy", n_jobs=8,
)
viz.fit(opt_sample.loc[:, features], opt_sample['churn'].astype(int))
viz.poof()
# Sweep 3: number of estimators.
param_range = [50, 100, 150, 200, 300]
# NOTE(review): this sweep fixes max_depth=4 while the previous sweep and the final
# model use max_depth=3 — confirm which depth the n_estimators sweep should assume.
viz = ValidationCurve(
    XGBClassifier(max_depth=4, learning_rate=0.15, random_state = 111), param_name="n_estimators", param_range=param_range,
    cv=3, scoring="accuracy", n_jobs=8,
)
viz.fit(opt_sample.loc[:, features], opt_sample['churn'].astype(int))
viz.poof()
# --- Iteration 3: tuned model + SHAP explanations ------------------------------
X_train, X_test, y_train, y_test = train_test_split(sample.loc[:, features], sample['churn'].astype(int),
                                                    test_size=0.3, random_state=7)
# Hyperparameters chosen from the validation curves above.
model = XGBClassifier(random_state = 111, max_depth=3, learning_rate=0.15, n_estimators=100, base_score=0.5)
model.fit(X_train, y_train)
calculate_performance_metrics(model, sample, X_train, X_test, y_train, y_test, 6, 'recall_6')
# SHAP values on the test set: global importance (bar) and per-feature impact (beeswarm).
X = X_test
explainer = shap.TreeExplainer(model)
shap_values = explainer.shap_values(X)
shap.summary_plot(shap_values, X, plot_type="bar")
shap.summary_plot(shap_values, X)
Let's visualize the model's performance in the different zip codes.
# RESIDUAL SPATIAL AUTOCORRELATION
# Choropleth of the tuned model's recall per zip code (buckets at 0.65/0.8/0.9).
residual_metric = 'recall_6'
#color: ramp(globalQuantiles(${}, 4), Emrld)
Map(
    Layer('edp_churn_residuals', '''
color: ramp(buckets(${}, [0.65, 0.8, 0.9]), Emrld)
strokeWidth: 0.4
strokeColor: opacity(blue,0.2)
order: asc(width())
'''.format(residual_metric),
          popup={
              'hover': [{
                  'title': 'Recall',
                  'value': '$recall_6'
              },{
                  'title': 'Precision',
                  'value': '$precision_6'
              },{
                  'title': 'Accuracy',
                  'value': '$accuracy_6'
              },
              {
                  'title': 'zip code',
                  'value': '$postcode'
              }]
          },
          legend={
              'title':'{}'.format(residual_metric),
              'type': 'color-continuous-polygon',
              'prop': 'color'
          }),
    # Initial view centered on mainland Portugal.
    viewport={
        'zoom': 5.75,
        'lat': 39.5129,
        'lng': -8.0977
    },
    basemap = basemaps.darkmatter
)
Here, we focus on two improvements identified throughout the analysis related to spatial components:
In the last result obtained from the optimized algorithm, we have a Moran's I value of 0.43 for the recall. Visualizing these data, we can see that low-recall areas are clustered.
On the other hand, low churn rate zip codes are also clustered (see map below).
If we analyze the relationship between these two variables, we can see that recall is high and robust (low variance) in zip codes with a high churn rate, while it is lower and with a higher variance in zip codes with lower churn rate. Since the total average churn rate of a zip code is one of the top 5 most important features, this is one of the reasons for having cluster areas of low recall.
# Relate per-zip-code accuracy to each zip code's relative churn-client count.
prediction = model.predict(X_test)
all_rows = np.full((X_test.shape[0],), True)  # mask selecting every test row
correct_rows = y_test == prediction           # rows the model classified correctly
accuracy_cp = calculate_metric(sample, X_test, y_test, prediction,
                               'cp_main', 'accuracy', all_rows, correct_rows)
# Attach the demographic churn-rate feature by postcode.
accuracy_cp = accuracy_cp.merge(
    demog[['postcode', 'no_churn_clients_rel']],
    how='left', left_on='cp_main', right_on='postcode')
plt.figure(figsize=(12, 7))
sns.regplot(accuracy_cp['no_churn_clients_rel'], accuracy_cp['accuracy'])
plt.hlines(0.65, 0.051, 0.225, 'red', '--')
# Choropleth of the relative number of churn clients per zip code; bucket
# thresholds (0.091980 / 0.125496 / 0.157762) split $no_churn_clients_rel.
Map(
    Layer('edp_cp_demographics_agg', '''
color: ramp(buckets($no_churn_clients_rel, [0.091980, 0.125496, 0.157762]), Emrld)
strokeWidth: 0.4
strokeColor: opacity(blue,0.2)
order: asc(width())
''',
          popup={
              'hover': [
                  {
                      'title': 'zip code',
                      'value': '$postcode'
                  }]
          },
          legend={
              # Fix: the legend previously reused residual_metric ('recall_6'),
              # copied from the residuals map, but this layer is colored by
              # no_churn_clients_rel.
              'title': 'no_churn_clients_rel',
              'type': 'color-continuous-polygon',
              'prop': 'color'
          }),
    # Same Portugal viewport as the residuals map.
    viewport={
        'zoom': 5.75,
        'lat': 39.5129,
        'lng': -8.0977
    },
    basemap=basemaps.darkmatter
)
Throughout the process of building the model, we have seen a consistent lower recall in the Southern part of Portugal (among other areas). This is because churn patterns in this region are different to other parts of Portugal, probably due to a higher share of second residences.
Here we build a model for the region of Algarve (postal codes starting with 8) and see that we get much better results than with the general model. Taking a close look at the results, we can see that the main differences with respect to the general model are:
A higher number of accounts (active and non-active) implies higher churn. This is not the case in the general model.
As a result, we go from a performance of 0.73 in accuracy and 0.82 in AUC with the general model, to an accuracy of 0.83 and an AUC of 0.92 with the specialized model (further details below).
# Evaluate the GENERAL model on the Algarve region only (zip codes starting
# with '8') to quantify its weaker performance there.
algarve_sample = sample[sample['cp_main'].astype(str).str[0] == '8']
algarve_prediction = model.predict(algarve_sample[features])
algarve_probabilities = model.predict_proba(algarve_sample[features])
# Hoisted: churn is boolean and was converted to int six separate times below.
y_true = algarve_sample['churn'].astype(int)
# METRIC SUMMARY
print('*** METRIC SUMMARY ***')
print(classification_report(y_true, algarve_prediction))
print('accuracy', accuracy_score(y_true, algarve_prediction))
print('precision', precision_score(y_true, algarve_prediction))
print('recall', recall_score(y_true, algarve_prediction))
print('f1 score', f1_score(y_true, algarve_prediction))
# ROC AUC from the positive-class probabilities (column 1).
fpr, tpr, _ = roc_curve(y_true, algarve_probabilities[:, 1])
roc_auc = auc(fpr, tpr)
print('ROC_AUC:', roc_auc)
# Build the training sample for an Algarve-only model (zip codes starting '8').
reg = '8'
region_mask = ((df_final['year_month'] > starting_churn_date_cut)
               & (df_final['year_month_start'] >= starting_join_date_cut)
               & (df_final['cp_main'].astype(str).str[0] == reg))
df_final_reg = df_final.loc[region_mask, client_features + non_spatial_features + new_features]
# Churn periods, most recent first: recent churn gets the heaviest weight.
ending_periods = df_final_reg['year_month'].sort_values(ascending=False).unique()
is_churner = df_final_reg['churn']
frac = 16  # oversample the most recent period 16x, halving each step back
chunks = []
for period in ending_periods:
    period_churners = df_final_reg[is_churner & (df_final_reg['year_month'] == period)]
    # frac > 1 means oversampling, which requires drawing with replacement.
    chunks.append(period_churners.sample(frac=frac, replace=frac > 1, random_state=111))
    frac = max(frac / 2, 0.025)  # decay the weight, floored at 2.5%
sample = pd.concat(chunks)
# Alternative (disabled): draw non-churners per postal code so they mirror the
# churners' spatial distribution instead of being sampled globally.
# for postal_code in df_final_reg['cp_main'].unique():
#     sample = pd.concat([sample, df_final_reg[(~df_final_reg['churn']) & (df_final_reg['cp_main'] == postal_code)].sample(sample[sample['cp_main'] == postal_code].shape[0], random_state=111)])
# Balance: one globally-drawn non-churner per churner row.
sample = pd.concat([sample, df_final_reg[~df_final_reg['churn']].sample(sample.shape[0], random_state=111)])
sample.shape
# Train the Algarve-specific model with the same tuned hyperparameters as the
# general model, on the region-only sample built above.
X_train, X_test, y_train, y_test = train_test_split(
    sample.loc[:, features], sample['churn'].astype(int),
    test_size=0.3, random_state=7)
model_algarve = XGBClassifier(random_state = 111, max_depth=3, learning_rate=0.15, n_estimators=100, base_score=0.5)
model_algarve.fit(X_train, y_train)
# NOTE(review): trailing args are (7, 'recall_7') here vs (6, 'recall_6') for the
# general model — confirm against calculate_performance_metrics.
calculate_performance_metrics(model_algarve, sample, X_train, X_test, y_train, y_test, 7, 'recall_7')
# SHAP feature importance for the regional model.
X_a = X_test
# NOTE(review): this rebinds the global `explainer`/`shap_values` to the Algarve
# model; the later force-plot cells index them with the general model's `X` —
# verify that combination is intended.
explainer = shap.TreeExplainer(model_algarve)
shap_values = explainer.shap_values(X_a)
shap.summary_plot(shap_values, X_a, plot_type="bar")  # mean |SHAP| per feature
shap.summary_plot(shap_values, X_a)
Once the model is trained, the following step would be to run it every month to identify those clients at risk of churning and for which the company can still do something to retain them.
The model we have trained provides a churn score that can be used to classify clients. We could, for example, classify them into Low, Mid, and High churn risk, so that retention teams can focus their actions depending on this classification.
In addition, it is very important to track the model's performance through time to know when it should be retrained. Two different policies may be followed:
This retraining is very important as churn patterns will change over time. We have actually identified two important patterns in our data:
# Out-of-time validation of the general model on January 2019.
# Hoisted: the subset df_final[year_month == 201901] and the int conversion of
# 'churn' were recomputed on every one of the nine lines below.
jan = df_final[df_final['year_month'] == 201901]
X_jan = jan.loc[:, features]
y_jan = jan['churn'].astype(int)
model.score(X_jan, y_jan)
final_prediction = model.predict(X_jan)
final_probabilities = model.predict_proba(X_jan)
# METRIC SUMMARY
print('*** METRIC SUMMARY ***')
print(classification_report(y_jan, final_prediction))
print('accuracy', accuracy_score(y_jan, final_prediction))
print('precision', precision_score(y_jan, final_prediction))
print('recall', recall_score(y_jan, final_prediction))
print('f1 score', f1_score(y_jan, final_prediction))
# ROC AUC from the positive-class probabilities (column 1).
fpr, tpr, _ = roc_curve(y_jan, final_probabilities[:, 1])
roc_auc = auc(fpr, tpr)
print('ROC_AUC:', roc_auc)
plot_roc_curve(fpr, tpr, roc_auc)
plt.show()
Apart from understanding at a global level what the factors driving churn are, the techniques used allow us to understand at a client level why they churn.
Below, two examples are shown (a churner, and a non-churner) where we can see the main factors which made the algorithm classify them as churner and non-churner, respectively.
This information is very powerful because it allows the company to design tailored client retention strategies.
Here we can see a customer for whom the model predicts he/she will churn. On the figure below we can see the two main factors for identifying this customer as a churner are:
The rest of the predictors are weak in determining the churn risk of this client.
# Local explanation (SHAP force plot) for one client predicted to churn.
shap.initjs()
# Row position within the SHAP matrix / feature frame used below.
idx_churner = 23
# NOTE(review): `explainer`/`shap_values` were last rebound for the Algarve model
# while `X` is the general model's test set — confirm they are aligned.
shap.force_plot(explainer.expected_value, shap_values[idx_churner,:], X.iloc[idx_churner,:])
# Client characteristics
# NOTE(review): assumes df_final index 2839767 is the same client as X row 23 — verify.
df_final.loc[2839767].to_frame().transpose()
In this case, we can see a customer for whom the model predicts he/she will not churn. On the figure below we can see the two main factors for identifying this customer as a non-churner are:
# Local explanation (SHAP force plot) for one client predicted NOT to churn.
shap.initjs()
# Row position within the SHAP matrix / feature frame used below.
idx_non_churner = 27
# NOTE(review): `explainer`/`shap_values` were last rebound for the Algarve model
# while `X` is the general model's test set — confirm they are aligned.
shap.force_plot(explainer.expected_value, shap_values[idx_non_churner,:], X.iloc[idx_non_churner,:])
# Client characteristics
# NOTE(review): assumes df_final index 1580680 is the same client as X row 27 — verify.
df_final.loc[1580680].to_frame().transpose()
# Out-of-time validation of the general model on February 2019 data.
df_feb = load_data('/gdrive/My Drive/final_dataframe_201902.csv', cat_vars, df_demog)
df_feb.head(2)
# Hoisted: the subset df_feb[year_month == 201902] and the int conversion of
# 'churn' were recomputed on every one of the nine lines below.
feb = df_feb[df_feb['year_month'] == 201902]
X_feb = feb.loc[:, features]
y_feb = feb['churn'].astype(int)
model.score(X_feb, y_feb)
final_prediction = model.predict(X_feb)
final_probabilities = model.predict_proba(X_feb)
# METRIC SUMMARY
print('*** METRIC SUMMARY ***')
print(classification_report(y_feb, final_prediction))
print('accuracy', accuracy_score(y_feb, final_prediction))
print('precision', precision_score(y_feb, final_prediction))
print('recall', recall_score(y_feb, final_prediction))
print('f1 score', f1_score(y_feb, final_prediction))
# ROC AUC from the positive-class probabilities (column 1).
fpr, tpr, _ = roc_curve(y_feb, final_probabilities[:, 1])
roc_auc = auc(fpr, tpr)
print('ROC_AUC:', roc_auc)
plot_roc_curve(fpr, tpr, roc_auc)
plt.show()
# Out-of-time validation of the general model on March 2019 data.
df_mar = load_data('/gdrive/My Drive/final_dataframe_201903.csv', cat_vars, df_demog)
df_mar.head(2)
# Hoisted: the subset df_mar[year_month == 201903] and the int conversion of
# 'churn' were recomputed on every one of the nine lines below.
mar = df_mar[df_mar['year_month'] == 201903]
X_mar = mar.loc[:, features]
y_mar = mar['churn'].astype(int)
model.score(X_mar, y_mar)
final_prediction = model.predict(X_mar)
final_probabilities = model.predict_proba(X_mar)
# METRIC SUMMARY
print('*** METRIC SUMMARY ***')
print(classification_report(y_mar, final_prediction))
print('accuracy', accuracy_score(y_mar, final_prediction))
print('precision', precision_score(y_mar, final_prediction))
print('recall', recall_score(y_mar, final_prediction))
print('f1 score', f1_score(y_mar, final_prediction))
# ROC AUC from the positive-class probabilities (column 1).
fpr, tpr, _ = roc_curve(y_mar, final_probabilities[:, 1])
roc_auc = auc(fpr, tpr)
print('ROC_AUC:', roc_auc)
plot_roc_curve(fpr, tpr, roc_auc)
plt.show()